library(tidyverse)
Registered S3 methods overwritten by 'dbplyr':
  method         from
  print.tbl_lazy     
  print.tbl_sql      
── Attaching packages ──────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.3.0 ──
✓ ggplot2 3.3.2     ✓ purrr   0.3.4
✓ tibble  3.0.1     ✓ dplyr   1.0.0
✓ tidyr   1.1.0     ✓ stringr 1.4.0
✓ readr   1.3.1     ✓ forcats 0.5.0
package ‘ggplot2’ was built under R version 3.6.2package ‘tibble’ was built under R version 3.6.2package ‘tidyr’ was built under R version 3.6.2package ‘purrr’ was built under R version 3.6.2package ‘dplyr’ was built under R version 3.6.2── Conflicts ─────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
x dplyr::filter() masks stats::filter()
x dplyr::lag()    masks stats::lag()
library(dplyr)

1.1 Load the diamonds.csv data set and undertake an initial exploration of the data. You will find a description of the meanings of the variables on the relevant Kaggle page.

# Read diamonds.csv (path relative to the working directory) into `diamonds`.
diamonds <- readr::read_csv(file = "diamonds.csv")
Missing column names filled in: 'X1' [1]Parsed with column specification:
cols(
  X1 = col_double(),
  carat = col_double(),
  cut = col_character(),
  color = col_character(),
  clarity = col_character(),
  depth = col_double(),
  table = col_double(),
  price = col_double(),
  x = col_double(),
  y = col_double(),
  z = col_double()
)
# Print the data, then show per-column summary statistics.
diamonds
summary(diamonds)
       X1            carat            cut               color             clarity              depth           table           price      
 Min.   :    1   Min.   :0.2000   Length:53940       Length:53940       Length:53940       Min.   :43.00   Min.   :43.00   Min.   :  326  
 1st Qu.:13486   1st Qu.:0.4000   Class :character   Class :character   Class :character   1st Qu.:61.00   1st Qu.:56.00   1st Qu.:  950  
 Median :26970   Median :0.7000   Mode  :character   Mode  :character   Mode  :character   Median :61.80   Median :57.00   Median : 2401  
 Mean   :26970   Mean   :0.7979                                                            Mean   :61.75   Mean   :57.46   Mean   : 3933  
 3rd Qu.:40455   3rd Qu.:1.0400                                                            3rd Qu.:62.50   3rd Qu.:59.00   3rd Qu.: 5324  
 Max.   :53940   Max.   :5.0100                                                            Max.   :79.00   Max.   :95.00   Max.   :18823  
       x                y                z         
 Min.   : 0.000   Min.   : 0.000   Min.   : 0.000  
 1st Qu.: 4.710   1st Qu.: 4.720   1st Qu.: 2.910  
 Median : 5.700   Median : 5.710   Median : 3.530  
 Mean   : 5.731   Mean   : 5.735   Mean   : 3.539  
 3rd Qu.: 6.540   3rd Qu.: 6.540   3rd Qu.: 4.040  
 Max.   :10.740   Max.   :58.900   Max.   :31.800  
library(ggiraphExtra)
Registered S3 method overwritten by 'htmlwidgets':
  method           from         
  print.htmlwidget tools:rstudio
library(GGally)

1.2 We expect the carat of the diamonds to be strongly correlated with the physical dimensions x, y and z. Use ggpairs() to investigate correlations between these four variables.

# Restrict the pairs plot to carat and the physical dimensions x, y and z,
# the four variables this question asks about, instead of every column.
diamonds %>%
  select(carat, x, y, z) %>%
  ggpairs()

1.3 So, we do find significant correlations. Let’s drop columns x, y and z from the dataset, in preparation to use only carat going forward.

# Remove the dimension columns x, y and z, keeping every other column,
# then print the result.
diamonds_xyz <- select(diamonds, -c(x, y, z))
diamonds_xyz
NA

1.4 We are interested in developing a regression model for the price of a diamond in terms of the possible predictor variables in the dataset. i. Use ggpairs() to investigate correlations between price and the predictors (this may take a while to run, don’t worry, make coffee or something).

# Leave out X1, the name read_csv gave the file's unnamed first column. It is
# a row index (1 to 53940), not a predictor of price.
diamonds_xyz %>%
  select(-X1) %>%
  ggpairs()

1.4 ii. Perform further ggplot visualisations of any significant correlations you find.

# Box plot of the carat values across all rows.
ggplot(data = diamonds_xyz, mapping = aes(y = carat)) +
  geom_boxplot()

# Point plot of carat for each color value; stored, then printed.
plot_diamonds <- ggplot(
  data = diamonds_xyz,
  mapping = aes(x = color, y = carat)
) +
  geom_point()
plot_diamonds

# Scatter plot of carat (y) against price (x); stored, then printed.
# NOTE(review): price is the response in the planned regression, so consider
# putting it on the y axis.
plot_diamonds <- ggplot(
  data = diamonds_xyz,
  mapping = aes(x = price, y = carat)
) +
  geom_point()
plot_diamonds

1.5 Shortly we may try a regression fit using one or more of the categorical predictors cut, clarity and color, so let’s investigate these predictors: i. Investigate the factor levels of these predictors. How many dummy variables do you expect for each of them?

library(fastDummies)

Use the dummy_cols() function in the fastDummies package to generate dummies for these predictors and check the number of dummies in each case.

# Create dummy columns for the three categorical predictors named in the
# exercise (cut, clarity, color) rather than the numeric carat column.
# remove_first_dummy drops the first dummy of each variable, leaving it as the
# reference level; remove_selected_columns drops the original columns.
diamonds_xyz_dummy <- diamonds_xyz %>%
  dummy_cols(
    select_columns = c("cut", "clarity", "color"),
    remove_first_dummy = TRUE,
    remove_selected_columns = TRUE
  )
Registered S3 method overwritten by 'data.table':
  method           from
  print.data.table     
# Print the result to check the dummy columns that were generated.
diamonds_xyz_dummy
LS0tCnRpdGxlOiAiUiBOb3RlYm9vayIKb3V0cHV0OiBodG1sX25vdGVib29rCi0tLQoKYGBge3J9CmxpYnJhcnkodGlkeXZlcnNlKQpgYGAKCmBgYHtyfQpsaWJyYXJ5KGRwbHlyKQpgYGAKCjEuMSBMb2FkIHRoZSBkaWFtb25kcy5jc3YgZGF0YSBzZXQgYW5kIHVuZGVydGFrZSBhbiBpbml0aWFsIGV4cGxvcmF0aW9uIG9mIHRoZSBkYXRhLiBZb3Ugd2lsbCBmaW5kIGEgZGVzY3JpcHRpb24gb2YgdGhlIG1lYW5pbmdzIG9mIHRoZSB2YXJpYWJsZXMgb24gdGhlIHJlbGV2YW50IEthZ2dsZSBwYWdlCmBgYHtyfQpkaWFtb25kcyA8LSByZWFkX2NzdigiZGlhbW9uZHMuY3N2IikKZGlhbW9uZHMKYGBgCgpgYGB7cn0Kc3VtbWFyeShkaWFtb25kcykKYGBgCgpgYGB7cn0KbGlicmFyeShnZ2lyYXBoRXh0cmEpCmxpYnJhcnkoR0dhbGx5KQpgYGAKCjEuMiBXZSBleHBlY3QgdGhlIGNhcmF0IG9mIHRoZSBkaWFtb25kcyB0byBiZSBzdHJvbmcgY29ycmVsYXRlZCB3aXRoIHRoZSBwaHlzaWNhbCBkaW1lbnNpb25zIHgsIHkgYW5kIHouIFVzZSBnZ3BhaXJzKCkgdG8gaW52ZXN0aWdhdGUgY29ycmVsYXRpb25zIGJldHdlZW4gdGhlc2UgZm91ciB2YXJpYWJsZXMuCmBgYHtyfQpnZ3BhaXJzKGRpYW1vbmRzKQpgYGAKCjEuMyBTbywgd2UgZG8gZmluZCBzaWduaWZpY2FudCBjb3JyZWxhdGlvbnMuIExldOKAmXMgZHJvcCBjb2x1bW5zIHgsIHkgYW5kIHogZnJvbSB0aGUgZGF0YXNldCwgaW4gcHJlcGFyYXRpb24gdG8gdXNlIG9ubHkgY2FyYXQgZ29pbmcgZm9yd2FyZC4KCmBgYHtyfQpkaWFtb25kc194eXogPC0gZGlhbW9uZHMgJT4lCiAgc2VsZWN0KC14LCAteSwgLSB6KQpkaWFtb25kc194eXoKCmBgYAoKMS40IFdlIGFyZSBpbnRlcmVzdGVkIGluIGRldmVsb3BpbmcgYSByZWdyZXNzaW9uIG1vZGVsIGZvciB0aGUgcHJpY2Ugb2YgYSBkaWFtb25kIGluIHRlcm1zIG9mIHRoZSBwb3NzaWJsZSBwcmVkaWN0b3IgdmFyaWFibGVzIGluIHRoZSBkYXRhc2V0LgppLiBVc2UgZ2dwYWlycygpIHRvIGludmVzdGlnYXRlIGNvcnJlbGF0aW9ucyBiZXR3ZWVuIHByaWNlIGFuZCB0aGUgcHJlZGljdG9ycyAodGhpcyBtYXkgdGFrZSBhIHdoaWxlIHRvIHJ1biwgZG9u4oCZdCB3b3JyeSwgbWFrZSBjb2ZmZWUgb3Igc29tZXRoaW5nKS4KYGBge3J9CmdncGFpcnMoZGlhbW9uZHNfeHl6KQoKYGBgCgoxLjQgaS4gUGVyZm9ybSBmdXJ0aGVyIGdncGxvdCB2aXN1YWxpc2F0aW9ucyBvZiBhbnkgc2lnbmlmaWNhbnQgY29ycmVsYXRpb25zIHlvdSBmaW5kLgpgYGB7cn0KZGlhbW9uZHNfeHl6ICU+JQogIGdncGxvdChhZXMoeSA9IGNhcmF0KSkgKwogIGdlb21fYm94cGxvdCgpCmBgYAoKYGBge3J9CnBsb3RfZGlhbW9uZHMgPC0gZGlhbW9uZHNfeHl6ICU+JQogIGdncGxvdChhZXMoeCA9IGNvbG9yLCB5ID0gY2FyYXQpKSArCiAgZ2VvbV9wb2ludCgpCnBsb3RfZGlhbW9uZHMKYGBgCgpgYGB7cn0KcGxvdF9kaWFtb25kcyA8LSBkaWFtb25kc194eXogJT4lCiAgZ2dwbG90
KGFlcyh4ID0gcHJpY2UsIHkgPSBjYXJhdCkpICsKICBnZW9tX3BvaW50KCkKcGxvdF9kaWFtb25kcwpgYGAKCjEuNSBTaG9ydGx5IHdlIG1heSB0cnkgYSByZWdyZXNzaW9uIGZpdCB1c2luZyBvbmUgb3IgbW9yZSBvZiB0aGUgY2F0ZWdvcmljYWwgcHJlZGljdG9ycyBjdXQsIGNsYXJpdHkgYW5kIGNvbG9yLCBzbyBsZXTigJlzIGludmVzdGlnYXRlIHRoZXNlIHByZWRpY3RvcnM6CmkuIEludmVzdGlnYXRlIHRoZSBmYWN0b3IgbGV2ZWxzIG9mIHRoZXNlIHByZWRpY3RvcnMuIEhvdyBtYW55IGR1bW15IHZhcmlhYmxlcyBkbyB5b3UgZXhwZWN0IGZvciBlYWNoIG9mIHRoZW0/CmBgYHtyfQoKYGBgCgpgYGB7cn0KbGlicmFyeShmYXN0RHVtbWllcykKYGBgCgpVc2UgdGhlIGR1bW15X2NvbHMoKSBmdW5jdGlvbiBpbiB0aGUgZmFzdER1bW1pZXMgcGFja2FnZSB0byBnZW5lcmF0ZSBkdW1taWVzIGZvciB0aGVzZSBwcmVkaWN0b3JzIGFuZCBjaGVjayB0aGUgbnVtYmVyIG9mIGR1bW1pZXMgaW4gZWFjaCBjYXNlLgpgYGB7cn0KZGlhbW9uZHNfeHl6X2R1bW15IDwtIGRpYW1vbmRzX3h5eiAlPiUKICBkdW1teV9jb2xzKHNlbGVjdF9jb2x1bW5zID0gImNhcmF0IiwgcmVtb3ZlX2ZpcnN0X2R1bW15ID0gVFJVRSwgcmVtb3ZlX3NlbGVjdGVkX2NvbHVtbnMgPSBUUlVFKQpkaWFtb25kc194eXpfZHVtbXkKYGBgCgoKCgoKCgoKCgo=